*//////////////////////////////////////////////////////////////////////////////
*                                                                             /         
*				High-Pressure, High-Paying Jobs?                              /
*               The Review of Economics and Statistics                        /
*                                                                             /
*               Authors: Markus Nagler, Johannes Rincke, Erwin Winkler        /
*                                                                             /
*               Do-File 5: Random forest prediction of health index           /
*                                                                             /
*               This file generates:                                          /
*               - Figure 1, panel (b)                                         /
*                                                                             /
*                                                                             /
*//////////////////////////////////////////////////////////////////////////////


* Load the processed BIBB 2018 file from the folder held in the global macro data.
* clear is the documented option of use for discarding whatever is in memory,
* and a forward slash as path separator works on every platform Stata runs on.
use "$data/bibb_2018_processed.dta", clear

* Descriptive variable labels. The variable-importance graph further down in
* this file uses a predictor's label (when it has one) as its bar label.
label variable d_codifiable  "Codifiability of job"
label variable d_routine     "Routine-intensity of job"
label variable female        "Gender"
label variable high_edu      "University degree"
label variable med_edu       "Vocational degree"
label variable number_sub    "Number of subordinates"
label variable german        "German nationality"
label variable temp_contract "Temporary contract"
label variable pressure      "Pressure index"

* Requires the user-written rforest package (install once with: ssc install rforest)

*////////////////////////////////////////
*                                       /
* Construct health index using PCA      /
*                                       /
*////////////////////////////////////////

* Principal component analysis over the ten health and strain items.
pca nosleep tired nervous exhausted_mental exhausted_phys d_taxing d_relax toomuchwork stress_increase d_limit

* No statistic is passed to predict, so its default after pca applies
* (the component score; with a single new variable, the first component).
predict health_index

* Keep only observations for which the index could be computed.
drop if missing(health_index)

* Standardise the index to mean 0 and standard deviation 1.
* The moments are held in temporary scalars so they keep full double precision.
tempname hi_mean hi_sd
summarize health_index
scalar `hi_mean' = r(mean)
scalar `hi_sd'   = r(sd)
replace health_index = (health_index - `hi_mean') / `hi_sd'

* Display the moments of the standardised index as a check.
summarize health_index



*///////////////////////////////////////////////////////////////////
*                                                                  /
* Randomly split dataset into training and test data (50-50)       /
*                                                                  /
*///////////////////////////////////////////////////////////////////

* Drop all matrices in memory and fix the random-number seed.
clear matrix
set seed 141015

* One uniform draw per observation; the sample median of the draws is the cut-off.
gen rvar = uniform()
summarize rvar, detail
tempname cut
scalar `cut' = r(p50)

* Draws at or below the median go to the training half, draws above it to the test half.
gen training = rvar <= `cut'
gen test     = rvar >  `cut'
sort test
* NOTE(review): sort without the stable option leaves the order of tied rows to
* Stata. Adding stable would pin it but could alter the row order behind the
* published results - confirm before changing.

* Reset the seed and put the rows into a random order.
* u restarts the same random sequence but is assigned in the row order left by the sort above.
set seed 141015
gen u = uniform()
sort u








*///////////////////////
*                      /
*   FINAL ESTIMATION   /
*                      /
*///////////////////////

* Fit the random forest for the health index and plot variable importance.
* Everything between preserve and restore works on a scratch copy of the data:
* the importance column and the label column created below are discarded at restore.
preserve

* Regression forest on the observations flagged test==1.
* NOTE(review): the model is estimated on the test==1 half, not the training
* half - confirm that this is the intended sample.
rforest health_index pressure high_edu med_edu age female german works_council temp_agency commute temp_contract firmsize normal_workhours shift standby number_sub d_routine d_codifiable computer phys_index if test==1, type(reg) iterations(3000) numvars(4) lsize(25) seed(141015)

* Show the importance matrix returned by rforest.
matrix list e(importance)

* Copy the importance matrix into the data: row i of the matrix lands in observation i.
matrix importance = e(importance)
svmat importance

* svmat names the new column importance1. Rename it so the commands below refer
* to an existing variable by its full name and do not depend on variable-name
* abbreviation (which fails under set varabbrev off and could otherwise pick up
* a different variable whose name starts the same way).
rename importance1 importance
list importance in 1/5

* id holds the bar label for each predictor.
gen id = ""

* The matrix row names are the predictor names, in the same order as the rows
* just written into the data.
local mynames : rownames importance
local k : word count `mynames'

* Guard for the case of more predictors than observations.
if `k' > _N {
    set obs `k'
}

* Use the variable label as bar label when there is one, otherwise the variable name.
forvalues i = 1/`k' {
    local aword : word `i' of `mynames'
    local alabel : variable label `aword'
    if ("`alabel'" != "") qui replace id = "`alabel'" in `i'
    else qui replace id = "`aword'" in `i'
}

* Horizontal bar chart of importance by predictor, sorted by importance.
* NOTE(review): the graph is drawn but not saved or exported in this block -
* confirm whether the figure is written to disk elsewhere.
graph hbar (mean) importance, over(id, sort(1)) ytitle(Prediction of health index: variable importance) yscale(range(0 1)  titlegap(4))

restore

















